#set the seed
set.seed(20020718)
#timing of script
# What time did we start running this script?
start_time <- Sys.time()
start_time
## [1] "2025-03-04 16:52:14 EST"
#load libraries
# Efficient package loading with pacman
# Don't forget to install pacman and DT if you don't have it yet. :)
pacman::p_load(tidyverse, dada2, phyloseq, patchwork, DT, devtools, install = FALSE)
#reading raw sequencing files
# Set the raw fastq path to the raw sequencing files
# Path to the fastq files
raw_fastqs_path <- "data/01_DADA2/01_raw_gzipped_fastqs"
raw_fastqs_path
## [1] "data/01_DADA2/01_raw_gzipped_fastqs"
# What files are in this path? Intuition Check
head(list.files(raw_fastqs_path))
## [1] "SRR13675761_1.fastq.gz" "SRR13675761_2.fastq.gz" "SRR13675762_1.fastq.gz"
## [4] "SRR13675762_2.fastq.gz" "SRR13675763_1.fastq.gz" "SRR13675763_2.fastq.gz"
# How many files are there?
length(list.files(raw_fastqs_path))
## [1] 372
# Create vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "1.fastq.gz", full.names = TRUE)
# Intuition Checks
head(forward_reads)
## [1] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675761_1.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675762_1.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675763_1.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675764_1.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675765_1.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675766_1.fastq.gz"
# Intuition check #2: We should have fewer reads in the forward vector than in total
stopifnot(length(forward_reads) < length(list.files(raw_fastqs_path)))
# Create a vector of reverse reads
reverse_reads <- list.files(raw_fastqs_path, pattern = "2.fastq.gz", full.names = TRUE)
# Intuition Checks
head(reverse_reads)
## [1] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675761_2.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675762_2.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675763_2.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675764_2.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675765_2.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/SRR13675766_2.fastq.gz"
# Intuition check #2: Need to have equal number of forward and reverse files
stopifnot(length(reverse_reads) == length(forward_reads))
Let’s see the quality of the raw reads before we trim
# Randomly select 12 samples from dataset to evaluate
# Selecting 12 is typically better than 2 (like we did in class for efficiency)
random_samples <- sample(1:length(reverse_reads), size = 12)
random_samples
## [1] 177 113 105 12 90 55 98 121 60 106 157 23
# Calculate and plot quality of these two samples
forward_filteredQual_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
labs(title = "Forward Read: Raw Quality")+
guides(scale = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the dada2 package.
## Please report the issue at <https://github.com/benjjneb/dada2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
forward_filteredQual_plot_12
## Warning: Removed 3600 rows containing missing values or values outside the scale range
## (`geom_tile()`).
reverse_filteredQual_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
labs(title = "Reverse Read: Raw Quality")+
guides(scale = "none")
# Plot them together with patchwork
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12
## Warning: Removed 3600 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 3600 rows containing missing values or values outside the scale range
## (`geom_tile()`).
Next, we will plot all of the samples aggregated into one forward (left) and one reverse read (right) plot.
# Aggregate all QC plots
# Forward reads
forward_preQC_plot <-
plotQualityProfile(forward_reads, aggregate = TRUE) +
labs(title = "Forward Pre-QC")+
guides(scale = "none")
# reverse reads
reverse_preQC_plot <-
plotQualityProfile(reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Pre-QC")+
guides(scale = "none")
# Now, let's put the two plots together
preQC_aggregate_plot <-
# Plot the forward and reverse together
forward_preQC_plot + reverse_preQC_plot
# Show the plot
preQC_aggregate_plot
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
INTERPRETATION #1 of Pre-QC: Here, in this location of your analyses, please insert a description of the interpretation you draw regarding your plots. You must note anything important about the figure you’ve created about your data including any spikes in quality and also the general trend of the raw QC plots. An example is provided below.
Above are plots showing the pre-QC quality scores of the raw sequencing data for the project. We can draw a few conclusions from the plots above, which were generated with Illumina MiSeq v3 chemistry with 300-bp paired end:
truncLen = 230 to remove the last 70 base pairs of every
read to keep quality scores at least above 20#prepare a placeholder for filtered reads
# Create vector of sample names from the filenames
sample_names <- sapply(strsplit(basename(forward_reads), "_"), `[`,1)
# Intuition Check
head(sample_names)
## [1] "SRR13675761" "SRR13675762" "SRR13675763" "SRR13675764" "SRR13675765"
## [6] "SRR13675766"
# Place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "data/01_DADA2/02_filtered_fastqs"
# Intuition Check
filtered_fastqs_path
## [1] "data/01_DADA2/02_filtered_fastqs"
# create 2 vectors: filtered_forward_reads & filtered_reverse_reads
filtered_forward_reads <-
file.path(filtered_fastqs_path, paste0(sample_names, "_R1_filtered.fastq.gz"))
# Intuition Check
length(filtered_forward_reads)
## [1] 186
# reverse reads
filtered_reverse_reads <-
file.path(filtered_fastqs_path, paste0(sample_names, "_R2_filtered.fastq.gz"))
# Intuition Check
head(filtered_reverse_reads)
## [1] "data/01_DADA2/02_filtered_fastqs/SRR13675761_R2_filtered.fastq.gz"
## [2] "data/01_DADA2/02_filtered_fastqs/SRR13675762_R2_filtered.fastq.gz"
## [3] "data/01_DADA2/02_filtered_fastqs/SRR13675763_R2_filtered.fastq.gz"
## [4] "data/01_DADA2/02_filtered_fastqs/SRR13675764_R2_filtered.fastq.gz"
## [5] "data/01_DADA2/02_filtered_fastqs/SRR13675765_R2_filtered.fastq.gz"
## [6] "data/01_DADA2/02_filtered_fastqs/SRR13675766_R2_filtered.fastq.gz"
#filter and trim
filtered_reads <-
filterAndTrim(forward_reads, filtered_forward_reads,
reverse_reads, filtered_reverse_reads,
truncLen = c(240,230), trimLeft = c(46,55),
maxN = 0, maxEE = c(2,2), truncQ = 2,
rm.phix = TRUE, compress = TRUE,
multithread = 10)
#assess trimming quality
# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <-
plotQualityProfile(filtered_forward_reads[random_samples]) +
labs(title = "Trimmed Forward Read Quality")+
guides(scale = "none")
reverse_filteredQual_plot_12 <-
plotQualityProfile(filtered_reverse_reads[random_samples]) +
labs(title = "Trimmed Reverse Read Quality")+
guides(scale = "none")
# Put the two plots together
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12
# Aggregate all QC plots
# Forward reads
forward_postQC_plot <-
plotQualityProfile(filtered_forward_reads, aggregate = TRUE) +
labs(title = "Forward Post-QC")+
guides(scale = "none")
# reverse reads
reverse_postQC_plot <-
plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Post-QC")+
guides(scale = "none")
# Now, let's put the two plots together
postQC_aggregate_plot <-
# Plot the forward and reverse together
forward_postQC_plot + reverse_postQC_plot
# Show the plot
postQC_aggregate_plot
# Make output into dataframe
filtered_df <- as.data.frame(filtered_reads) %>%
mutate(percent.retained = reads.out/reads.in)
# Intuition check
# Visualize it in table format
DT::datatable(filtered_df)
# Let's calculate some statistics
read_stats_df <-
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained = (median(reads.out)/median(reads.in)),
max_percent_retained = max(reads.out/reads.in),
min_percent_retained = min(reads.out/reads.in))
# Take a look at it!
read_stats_df
## median_reads_in median_reads_out median_percent_retained max_percent_retained
## 1 121481.5 85830.5 0.7065314 0.9590911
## min_percent_retained
## 1 0.02417795
# Plot it
numSeqs_QC_dotplot <-
filtered_df %>%
ggplot(aes(x = reads.in, y = reads.out)) +
geom_point(alpha = 0.5, size = 2) +
labs(x = "# of Raw Seqs",
y = "# of Seqs Retained") +
# Now let's add a 1:1 line for reference of keeping 100% of the reads
geom_abline(slope=1, intercept = 0, color = "deeppink")
# Now, let's look at the number of reads retained in a histogram
numRetained_QC_histplot <-
filtered_df %>%
ggplot(aes(x = reads.out)) +
geom_histogram() +
labs(x = "# of Seqs Retained",
y = "# of Samples")
# Create a histogram of percent reads retained in a histogram
percSeqs_QC_histplot <-
filtered_df %>%
ggplot(aes(x = percent.retained)) +
geom_histogram() +
labs(x = "% of Seqs Retained",
y = "# of Samples") +
# Set the scale to be between 0-1 (0-100%)
scale_x_continuous(limits = c(0, 1))
# Now, let's put the plots together
numSeqs_QC_dotplot + numRetained_QC_histplot + percSeqs_QC_histplot +
plot_annotation(tag_levels = 'A')
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
INTERPRETATION #3 of Post-QC Statistics:
Here, in this location of your analyses, please insert a description of
the interpretation you draw regarding your read retainment pre- and
post-QC. Please note anything important about the three paneled figure
you created above. Think about how many reads got through? Is it
“enough”? Should I play with the parameters in
filterAndTrim() more? If so, which parameters? An example
interpretation is provided below.
This figure presents three panels showing how many sequences were retained after quality filtering and trimming in the DADA2 pipeline. Let’s break down each panel:
Panel A: Scatter Plot of Raw vs. Retained Sequences:
Interpretation of Panel A:
Panel B: Histogram of the Number of Sequences Retained per Sample
Interpretation of Panel B
- The majority of samples have between ~5,000 and 10,000 retained
sequences, which suggests good filtering efficiency. - Around 9 samples
retained no reads and will need to be removed. This is expected as from
the quality plots of 12 random samples, some samples have a very poor
quality score throughout.
Panel C: Histogram of Percent of Sequences Retained
Interpretation of Panel C.
Consider re-running your filterAndTrim()
if:
maxEE (expected errors) or adjusting truncation lengths
(truncLen).# Plot the pre and post together in one plot
preQC_aggregate_plot / postQC_aggregate_plot
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
INTERPRETATION #4 is comparing your pre-QC to your post-QC read quality. Here, in this location of your analyses, please insert a description of the interpretation you draw from yor overall quality control results. Are you satisfied with your QC process? An example interpretation is provided below.
Quality Score Improvements
# Take the time now that we are at the end of the script
end_time <- Sys.time()
end_time
## [1] "2025-03-04 17:42:08 EST"
# Echo the elapsed time
elapsed_time <- round((end_time - start_time), 3)
elapsed_time
## Time difference of 49.908 mins
#session information
# Ensure reproducibility
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.3 (2023-03-15)
## os Rocky Linux 9.4 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2025-03-04
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.2.3)
## ape 5.8-1 2024-12-16 [1] CRAN (R 4.2.3)
## Biobase 2.58.0 2022-11-01 [2] Bioconductor
## BiocGenerics 0.44.0 2022-11-01 [2] Bioconductor
## BiocParallel 1.32.6 2023-03-17 [2] Bioconductor
## biomformat 1.26.0 2022-11-01 [1] Bioconductor
## Biostrings 2.66.0 2022-11-01 [2] Bioconductor
## bitops 1.0-9 2024-10-03 [2] CRAN (R 4.2.3)
## bslib 0.8.0 2024-07-29 [2] CRAN (R 4.2.3)
## cachem 1.1.0 2024-05-16 [2] CRAN (R 4.2.3)
## cli 3.6.4 2025-02-13 [1] CRAN (R 4.2.3)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.2.3)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.2.3)
## colorspace 2.1-1 2024-07-26 [2] CRAN (R 4.2.3)
## crayon 1.5.3 2024-06-20 [2] CRAN (R 4.2.3)
## crosstalk 1.2.1 2023-11-23 [2] CRAN (R 4.2.3)
## dada2 * 1.26.0 2022-11-01 [1] Bioconductor
## data.table 1.16.2 2024-10-10 [2] CRAN (R 4.2.3)
## DelayedArray 0.24.0 2022-11-01 [2] Bioconductor
## deldir 2.0-4 2024-02-28 [2] CRAN (R 4.2.3)
## devtools * 2.4.5 2022-10-11 [1] CRAN (R 4.2.3)
## digest 0.6.37 2024-08-19 [2] CRAN (R 4.2.3)
## dplyr * 1.1.4 2023-11-17 [2] CRAN (R 4.2.3)
## DT * 0.33 2024-04-04 [1] CRAN (R 4.2.3)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.2.3)
## evaluate 1.0.1 2024-10-10 [2] CRAN (R 4.2.3)
## farver 2.1.2 2024-05-13 [2] CRAN (R 4.2.3)
## fastmap 1.2.0 2024-05-15 [2] CRAN (R 4.2.3)
## forcats * 1.0.0 2023-01-29 [2] CRAN (R 4.2.3)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.3)
## fs 1.6.5 2024-10-30 [2] CRAN (R 4.2.3)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.2.3)
## GenomeInfoDb 1.34.9 2023-02-02 [2] Bioconductor
## GenomeInfoDbData 1.2.9 2024-11-25 [2] Bioconductor
## GenomicAlignments 1.34.1 2023-03-09 [1] Bioconductor
## GenomicRanges 1.50.2 2022-12-16 [2] Bioconductor
## ggplot2 * 3.5.1 2024-04-23 [2] CRAN (R 4.2.3)
## glue 1.8.0 2024-09-30 [2] CRAN (R 4.2.3)
## gtable 0.3.6 2024-10-25 [2] CRAN (R 4.2.3)
## hms 1.1.3 2023-03-21 [2] CRAN (R 4.2.3)
## htmltools 0.5.8.1 2024-04-04 [2] CRAN (R 4.2.3)
## htmlwidgets 1.6.4 2023-12-06 [2] CRAN (R 4.2.3)
## httpuv 1.6.15 2024-03-26 [2] CRAN (R 4.2.3)
## hwriter 1.3.2.1 2022-04-08 [1] CRAN (R 4.2.3)
## igraph 2.1.1 2024-10-19 [2] CRAN (R 4.2.3)
## interp 1.1-6 2024-01-26 [1] CRAN (R 4.2.3)
## IRanges 2.32.0 2022-11-01 [2] Bioconductor
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.3)
## jpeg 0.1-10 2022-11-29 [1] CRAN (R 4.2.3)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.2.3)
## jsonlite 1.8.9 2024-09-20 [2] CRAN (R 4.2.3)
## knitr 1.49 2024-11-08 [2] CRAN (R 4.2.3)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.2.3)
## later 1.3.2 2023-12-06 [2] CRAN (R 4.2.3)
## lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.3)
## latticeExtra 0.6-30 2022-07-04 [1] CRAN (R 4.2.3)
## lifecycle 1.0.4 2023-11-07 [2] CRAN (R 4.2.3)
## lubridate * 1.9.3 2023-09-27 [2] CRAN (R 4.2.3)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.2.3)
## MASS 7.3-58.2 2023-01-23 [2] CRAN (R 4.2.3)
## Matrix 1.6-4 2023-11-30 [2] CRAN (R 4.2.3)
## MatrixGenerics 1.10.0 2022-11-01 [2] Bioconductor
## matrixStats 1.4.1 2024-09-08 [2] CRAN (R 4.2.3)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.2.3)
## mgcv 1.8-42 2023-03-02 [2] CRAN (R 4.2.3)
## mime 0.12 2021-09-28 [2] CRAN (R 4.2.3)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.2.3)
## multtest 2.54.0 2022-11-01 [1] Bioconductor
## munsell 0.5.1 2024-04-01 [2] CRAN (R 4.2.3)
## nlme 3.1-162 2023-01-31 [2] CRAN (R 4.2.3)
## pacman 0.5.1 2019-03-11 [1] CRAN (R 4.2.3)
## patchwork * 1.3.0.9000 2025-02-19 [1] Github (thomasp85/patchwork@2695a9f)
## permute 0.9-7 2022-01-27 [1] CRAN (R 4.2.3)
## phyloseq * 1.42.0 2022-11-01 [1] Bioconductor
## pillar 1.10.1 2025-01-07 [1] CRAN (R 4.2.3)
## pkgbuild 1.4.5 2024-10-28 [2] CRAN (R 4.2.3)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.3)
## pkgload 1.4.0 2024-06-28 [2] CRAN (R 4.2.3)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.2.3)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.2.3)
## profvis 0.4.0 2024-09-20 [2] CRAN (R 4.2.3)
## promises 1.3.0 2024-04-05 [2] CRAN (R 4.2.3)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.2.3)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.2.3)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.2.3)
## Rcpp * 1.0.13-1 2024-11-02 [2] CRAN (R 4.2.3)
## RcppParallel 5.1.10 2025-01-24 [1] CRAN (R 4.2.3)
## RCurl 1.98-1.16 2024-07-11 [2] CRAN (R 4.2.3)
## readr * 2.1.5 2024-01-10 [2] CRAN (R 4.2.3)
## remotes 2.5.0 2024-03-17 [2] CRAN (R 4.2.3)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.2.3)
## rhdf5 2.42.1 2023-04-07 [1] Bioconductor
## rhdf5filters 1.10.1 2023-03-24 [1] Bioconductor
## Rhdf5lib 1.20.0 2022-11-01 [1] Bioconductor
## rlang 1.1.5 2025-01-17 [1] CRAN (R 4.2.3)
## rmarkdown 2.29 2024-11-04 [2] CRAN (R 4.2.3)
## Rsamtools 2.14.0 2022-11-01 [1] Bioconductor
## rstudioapi 0.17.1 2024-10-22 [2] CRAN (R 4.2.3)
## S4Vectors 0.36.2 2023-02-26 [2] Bioconductor
## sass 0.4.9 2024-03-15 [2] CRAN (R 4.2.3)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.2.3)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.3)
## shiny 1.9.1 2024-08-01 [2] CRAN (R 4.2.3)
## ShortRead 1.56.1 2022-11-18 [1] Bioconductor
## stringi 1.8.4 2024-05-06 [2] CRAN (R 4.2.3)
## stringr * 1.5.1 2023-11-14 [2] CRAN (R 4.2.3)
## SummarizedExperiment 1.28.0 2022-11-01 [2] Bioconductor
## survival 3.5-3 2023-02-12 [2] CRAN (R 4.2.3)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.2.3)
## tidyr * 1.3.1 2024-01-24 [2] CRAN (R 4.2.3)
## tidyselect 1.2.1 2024-03-11 [2] CRAN (R 4.2.3)
## tidyverse * 2.0.0 2023-02-22 [2] CRAN (R 4.2.3)
## timechange 0.3.0 2024-01-18 [2] CRAN (R 4.2.3)
## tzdb 0.4.0 2023-05-12 [2] CRAN (R 4.2.3)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.2.3)
## usethis * 3.0.0 2024-07-29 [2] CRAN (R 4.2.3)
## vctrs 0.6.5 2023-12-01 [2] CRAN (R 4.2.3)
## vegan 2.6-10 2025-01-29 [1] CRAN (R 4.2.3)
## withr 3.0.2 2024-10-28 [2] CRAN (R 4.2.3)
## xfun 0.49 2024-10-31 [2] CRAN (R 4.2.3)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.2.3)
## XVector 0.38.0 2022-11-01 [2] Bioconductor
## yaml 2.3.10 2024-07-26 [2] CRAN (R 4.2.3)
## zlibbioc 1.44.0 2022-11-01 [2] Bioconductor
##
## [1] /home/zx54/R/x86_64-pc-linux-gnu-library/4.2
## [2] /programs/R-4.2.3/lib64/R/library
##
## ──────────────────────────────────────────────────────────────────────────────